library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v readr   1.3.1
## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.2     v stringr 1.3.1
## v ggplot2 3.1.0     v forcats 0.4.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(cluster)
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Load the domestic T20 career batting ratings from the local CSV export.
domestict20batting <- read.csv(
  file = "D:\\Vishal\\III year\\Data Analytics\\Assignment II\\Player Ratings\\domestict20careerbattingrating_mod.csv"
)

# Print the first six rows as a quick check of the import.
head(x = domestict20batting, n = 6L)
##          Name Matches Innings Not_Outs Runs High_Score Average No_Of_100
## 1   C H Gayle     256     251       34 9120        175   42.02        17
## 2     V Kohli     204     192       38 6446        113   41.85         4
## 3   B J Hodge     256     242       55 6997        106   37.41         2
## 4  D A Warner     220     219       22 6868        135   34.86         5
## 5 K A Pollard     312     279       84 6095         89   31.25         0
## 6    S K Rai0     242     228       37 6326        109   33.12         3
##   No_Of_50 Strike_Rate Catches_Taken Runs.Innings    Rating ScaledRating
## 1       57      150.34            64     36.33466 188036.28        20.82
## 2       46      133.01            89     33.57292 121996.95        18.69
## 3       46      131.72            96     28.91322 116714.79        18.48
## 4       54      143.68            96     31.36073 110581.51        18.24
## 5       30      152.71           177     21.84588 104085.82        17.96
## 6       36      139.00           124     27.74561  98915.61        17.73
##   LogRating Stumpings
## 1  5.274242        NA
## 2  5.086349        NA
## 3  5.067126        NA
## 4  5.043683        NA
## 5  5.017392        NA
## 6  4.995265        NA
# Per-column summary statistics (also reports the NA count per column).
summary(object = domestict20batting)
##                Name        Matches          Innings          Not_Outs    
##  A Singh         :  2   Min.   :  1.00   Min.   :  0.00   Min.   : 0.00  
##  S Sharma        :  2   1st Qu.: 30.00   1st Qu.: 17.00   1st Qu.: 4.00  
##  Yuvraj Singh    :  2   Median : 60.00   Median : 38.00   Median : 9.00  
##  A A Bilakhia    :  1   Mean   : 80.75   Mean   : 59.58   Mean   :13.28  
##  A A Chavan      :  1   3rd Qu.:119.00   3rd Qu.: 80.00   3rd Qu.:18.00  
##  A A Jhunjhunwala:  1   Max.   :312.00   Max.   :279.00   Max.   :86.00  
##  (Other)         :477                                                    
##       Runs          High_Score        Average        No_Of_100      
##  Min.   :   0.0   Min.   :  0.00   Min.   : 0.00   Min.   : 0.0000  
##  1st Qu.: 102.2   1st Qu.: 22.00   1st Qu.:10.57   1st Qu.: 0.0000  
##  Median : 476.0   Median : 56.50   Median :19.52   Median : 0.0000  
##  Mean   :1156.4   Mean   : 57.17   Mean   :18.52   Mean   : 0.3025  
##  3rd Qu.:1394.5   3rd Qu.: 86.00   3rd Qu.:25.29   3rd Qu.: 0.0000  
##  Max.   :9120.0   Max.   :175.00   Max.   :56.00   Max.   :17.0000  
##                                                                     
##     No_Of_50       Strike_Rate    Catches_Taken     Runs.Innings   
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:102.3   1st Qu.:  8.00   1st Qu.: 5.609  
##  Median : 1.000   Median :119.0   Median : 20.00   Median :14.171  
##  Mean   : 5.527   Mean   :113.1   Mean   : 27.87   Mean   :14.049  
##  3rd Qu.: 6.000   3rd Qu.:129.1   3rd Qu.: 38.00   3rd Qu.:21.298  
##  Max.   :57.000   Max.   :190.9   Max.   :177.00   Max.   :36.335  
##                                                                    
##      Rating          ScaledRating      LogRating        Stumpings    
##  Min.   :     0.0   Min.   : 0.000   Min.   :-1.148   Min.   : 1.00  
##  1st Qu.:   863.3   1st Qu.: 5.420   1st Qu.: 2.936   1st Qu.: 4.00  
##  Median :  4908.2   Median : 8.370   Median : 3.691   Median : 9.50  
##  Mean   : 14462.3   Mean   : 8.559   Mean   : 3.461   Mean   :13.65  
##  3rd Qu.: 15490.2   3rd Qu.:11.158   3rd Qu.: 4.190   3rd Qu.:16.00  
##  Max.   :188036.3   Max.   :20.820   Max.   : 5.274   Max.   :77.00  
##                                                       NA's   :432
# K-means clustering of players on the numeric batting columns, followed by a
# scatter plot of runs per innings against innings, coloured by cluster.
set.seed(20)

# Columns 2:15 are the numeric measures (Matches ... LogRating); Name
# (column 1) and Stumpings (column 16, mostly NA) are left out. The columns
# are standardised first: k-means uses Euclidean distance, and Rating
# (values up to ~188000) would otherwise swamp every other column.
batting_features <- scale(domestict20batting[, 2:15])

# Several random starts make the result less dependent on the initial centres.
domesticBatCluster <- kmeans(batting_features, centers = 3, nstart = 25)

# k-means numbers its clusters arbitrarily, so renumber them by mean Rating
# (1 = highest) before the fixed "Best"/"Good"/"Bad" legend labels are
# attached. cluster_order[i] is the original id of the i-th best cluster.
rating_by_cluster <- tapply(
  domestict20batting$Rating, domesticBatCluster$cluster, mean
)
cluster_order <- order(rating_by_cluster, decreasing = TRUE)

# Keep the per-cluster components of the fit aligned with the new numbering.
domesticBatCluster$centers <-
  domesticBatCluster$centers[cluster_order, , drop = FALSE]
rownames(domesticBatCluster$centers) <- seq_along(cluster_order)
domesticBatCluster$size <- domesticBatCluster$size[cluster_order]
domesticBatCluster$withinss <- domesticBatCluster$withinss[cluster_order]
domesticBatCluster$cluster <- factor(
  match(domesticBatCluster$cluster, cluster_order),
  levels = seq_along(cluster_order)
)

# Players with zero innings give 0 / 0 on the y axis and are dropped by
# geom_point() with a warning.
ggplot(
  mutate(domestict20batting, cluster = domesticBatCluster$cluster),
  aes(Innings, Runs / Innings, color = cluster)
) +
  geom_point(size = 2) +
  scale_color_hue(labels = c("Best players", "Good players", "Bad Players")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Domestic T20 Batting Average")
## Warning: Removed 3 rows containing missing values (geom_point).

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive version of the innings vs. runs-per-innings scatter, coloured
# by the first k-means clustering; hovering a marker shows the player name.
p <- domestict20batting %>%
  plot_ly(
    x = ~Innings,
    y = ~Runs/Innings,
    type = "scatter",
    mode = "markers",
    color = domesticBatCluster$cluster,
    text = ~paste('Name: ', Name)
  ) %>%
  layout(title = "Cluster of Averages (batsmen)")

# Render the widget.
p
## Warning: Ignoring 3 observations
# Re-inspect the first six rows before selecting columns for the next model.
head(x = domestict20batting, n = 6L)
##          Name Matches Innings Not_Outs Runs High_Score Average No_Of_100
## 1   C H Gayle     256     251       34 9120        175   42.02        17
## 2     V Kohli     204     192       38 6446        113   41.85         4
## 3   B J Hodge     256     242       55 6997        106   37.41         2
## 4  D A Warner     220     219       22 6868        135   34.86         5
## 5 K A Pollard     312     279       84 6095         89   31.25         0
## 6    S K Rai0     242     228       37 6326        109   33.12         3
##   No_Of_50 Strike_Rate Catches_Taken Runs.Innings    Rating ScaledRating
## 1       57      150.34            64     36.33466 188036.28        20.82
## 2       46      133.01            89     33.57292 121996.95        18.69
## 3       46      131.72            96     28.91322 116714.79        18.48
## 4       54      143.68            96     31.36073 110581.51        18.24
## 5       30      152.71           177     21.84588 104085.82        17.96
## 6       36      139.00           124     27.74561  98915.61        17.73
##   LogRating Stumpings
## 1  5.274242        NA
## 2  5.086349        NA
## 3  5.067126        NA
## 4  5.043683        NA
## 5  5.017392        NA
## 6  4.995265        NA
# Second k-means model, restricted to three rate-type measures.
# Seed locally so this chunk gives the same clusters when run on its own.
set.seed(20)

df <- dplyr::select(domestict20batting, Average, Strike_Rate, Runs.Innings)

# Standardise before clustering: Strike_Rate spans roughly 0-191 while Average
# and Runs.Innings stay below 60, so on the raw scale Strike_Rate alone would
# drive the Euclidean distances. nstart > 1 guards against a poor random start.
domesticBatCluster1 <- kmeans(scale(df), centers = 3, nstart = 25)
domesticBatCluster1$cluster <- as.factor(domesticBatCluster1$cluster)

# Matches vs. strike rate, coloured by the cluster found above; hovering a
# marker shows the player name.
q <- plot_ly(domestict20batting, x = ~Matches, y = ~Strike_Rate,
             type = "scatter", mode = "markers",
             color = domesticBatCluster1$cluster,
             text = ~paste("Name: ", Name)) %>%
  layout(title = "Cluster of matches vs Strike Rate")
q
# Show column types and dimensions of the raw data frame.
str(object = domestict20batting)
## 'data.frame':    486 obs. of  16 variables:
##  $ Name         : Factor w/ 483 levels "A A Bilakhia",..: 84 458 70 100 193 388 433 62 333 271 ...
##  $ Matches      : int  256 204 256 220 312 242 233 236 234 240 ...
##  $ Innings      : int  251 192 242 219 279 228 221 233 223 216 ...
##  $ Not_Outs     : int  34 38 55 22 84 37 63 22 38 86 ...
##  $ Runs         : int  9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
##  $ High_Score   : int  175 113 106 135 89 109 95 158 109 73 ...
##  $ Average      : num  42 41.9 37.4 34.9 31.2 ...
##  $ No_Of_100    : int  17 4 2 5 0 3 0 7 3 0 ...
##  $ No_Of_50     : int  57 46 46 54 30 36 37 32 42 18 ...
##  $ Strike_Rate  : num  150 133 132 144 153 ...
##  $ Catches_Taken: int  64 89 96 96 177 124 92 96 91 122 ...
##  $ Runs.Innings : num  36.3 33.6 28.9 31.4 21.8 ...
##  $ Rating       : num  188036 121997 116715 110582 104086 ...
##  $ ScaledRating : num  20.8 18.7 18.5 18.2 18 ...
##  $ LogRating    : num  5.27 5.09 5.07 5.04 5.02 ...
##  $ Stumpings    : int  NA NA NA NA NA NA NA 14 NA 59 ...
# TRUE when at least one cell of the data frame is missing.
is.na(domestict20batting) %>% any()
## [1] TRUE
# Stumpings is missing for most players (432 NA values in the summary), so it
# is removed from the modelling frame. The values are kept under their own
# name: storing them in `domestict20batting_label` loses them, because that
# variable is reassigned to the player names further down.
domestict20batting_stumpings <- domestict20batting$Stumpings
domestict20batting$Stumpings <- NULL

# Confirm the column is gone.
str(domestict20batting)
## 'data.frame':    486 obs. of  15 variables:
##  $ Name         : Factor w/ 483 levels "A A Bilakhia",..: 84 458 70 100 193 388 433 62 333 271 ...
##  $ Matches      : int  256 204 256 220 312 242 233 236 234 240 ...
##  $ Innings      : int  251 192 242 219 279 228 221 233 223 216 ...
##  $ Not_Outs     : int  34 38 55 22 84 37 63 22 38 86 ...
##  $ Runs         : int  9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
##  $ High_Score   : int  175 113 106 135 89 109 95 158 109 73 ...
##  $ Average      : num  42 41.9 37.4 34.9 31.2 ...
##  $ No_Of_100    : int  17 4 2 5 0 3 0 7 3 0 ...
##  $ No_Of_50     : int  57 46 46 54 30 36 37 32 42 18 ...
##  $ Strike_Rate  : num  150 133 132 144 153 ...
##  $ Catches_Taken: int  64 89 96 96 177 124 92 96 91 122 ...
##  $ Runs.Innings : num  36.3 33.6 28.9 31.4 21.8 ...
##  $ Rating       : num  188036 121997 116715 110582 104086 ...
##  $ ScaledRating : num  20.8 18.7 18.5 18.2 18 ...
##  $ LogRating    : num  5.27 5.09 5.07 5.04 5.02 ...
# Keep the player names aside as labels, then drop the column so that only
# numeric columns remain in the data frame.
domestict20batting_label <- domestict20batting[["Name"]]
domestict20batting[["Name"]] <- NULL

# Confirm that every remaining column is numeric.
str(object = domestict20batting)
## 'data.frame':    486 obs. of  14 variables:
##  $ Matches      : int  256 204 256 220 312 242 233 236 234 240 ...
##  $ Innings      : int  251 192 242 219 279 228 221 233 223 216 ...
##  $ Not_Outs     : int  34 38 55 22 84 37 63 22 38 86 ...
##  $ Runs         : int  9120 6446 6997 6868 6095 6326 5995 6671 6137 4884 ...
##  $ High_Score   : int  175 113 106 135 89 109 95 158 109 73 ...
##  $ Average      : num  42 41.9 37.4 34.9 31.2 ...
##  $ No_Of_100    : int  17 4 2 5 0 3 0 7 3 0 ...
##  $ No_Of_50     : int  57 46 46 54 30 36 37 32 42 18 ...
##  $ Strike_Rate  : num  150 133 132 144 153 ...
##  $ Catches_Taken: int  64 89 96 96 177 124 92 96 91 122 ...
##  $ Runs.Innings : num  36.3 33.6 28.9 31.4 21.8 ...
##  $ Rating       : num  188036 121997 116715 110582 104086 ...
##  $ ScaledRating : num  20.8 18.7 18.5 18.2 18 ...
##  $ LogRating    : num  5.27 5.09 5.07 5.04 5.02 ...
# Standardise every column (mean 0, sd 1) and convert the resulting matrix
# back to a data frame.
domestict20batting_sc <- domestict20batting %>%
  scale() %>%
  as.data.frame()

# Check the standardised ranges.
summary(object = domestict20batting_sc)
##     Matches           Innings           Not_Outs            Runs        
##  Min.   :-1.2271   Min.   :-0.9941   Min.   :-0.9950   Min.   :-0.7375  
##  1st Qu.:-0.7809   1st Qu.:-0.7104   1st Qu.:-0.6952   1st Qu.:-0.6723  
##  Median :-0.3193   Median :-0.3600   Median :-0.3206   Median :-0.4339  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.5886   3rd Qu.: 0.3408   3rd Qu.: 0.3539   3rd Qu.: 0.1519  
##  Max.   : 3.5582   Max.   : 3.6614   Max.   : 5.4494   Max.   : 5.0790  
##    High_Score          Average          No_Of_100          No_Of_50       
##  Min.   :-1.53913   Min.   :-1.9488   Min.   :-0.2746   Min.   :-0.57008  
##  1st Qu.:-0.94687   1st Qu.:-0.8360   1st Qu.:-0.2746   1st Qu.:-0.57008  
##  Median :-0.01811   Median : 0.1052   Median :-0.2746   Median :-0.46693  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.77604   3rd Qu.: 0.7128   3rd Qu.:-0.2746   3rd Qu.: 0.04882  
##  Max.   : 3.17198   Max.   : 3.9453   Max.   :15.1578   Max.   : 5.30946  
##   Strike_Rate      Catches_Taken      Runs.Innings          Rating       
##  Min.   :-4.0852   Min.   :-0.9939   Min.   :-1.61390   Min.   :-0.6247  
##  1st Qu.:-0.3883   1st Qu.:-0.7087   1st Qu.:-0.96955   1st Qu.:-0.5874  
##  Median : 0.2132   Median :-0.2808   Median : 0.01399   Median :-0.4127  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.5799   3rd Qu.: 0.3610   3rd Qu.: 0.83268   3rd Qu.: 0.0444  
##  Max.   : 2.8117   Max.   : 5.3174   Max.   : 2.56007   Max.   : 7.4973  
##   ScaledRating        LogRating      
##  Min.   :-2.03858   Min.   :-4.1949  
##  1st Qu.:-0.74769   1st Qu.:-0.4778  
##  Median :-0.04508   Median : 0.2091  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.61882   3rd Qu.: 0.6634  
##  Max.   : 2.92016   Max.   : 1.6501
# Hierarchical clustering (average linkage) on Euclidean distances between
# the standardised player rows.
dist_mat <- dist(domestict20batting_sc, method = "euclidean")

hclust_avg <- hclust(dist_mat, method = "average")
plot(hclust_avg)

# Number of clusters to cut the tree into; used for both the assignment and
# the rectangles so the two cannot drift apart.
n_clusters <- 6
cut_avg <- cutree(hclust_avg, k = n_clusters)

# Redraw the dendrogram and box each cluster. rect.hclust() recycles `border`,
# so it needs one colour per cluster; with fewer colours than clusters two
# boxes end up sharing an outline colour.
plot(hclust_avg)
rect.hclust(hclust_avg, k = n_clusters, border = seq_len(n_clusters) + 1)

# Reference line at height 3.
# NOTE(review): not verified that cutting at h = 3 yields the same clusters
# as k = 6 - confirm.
abline(h = 3, col = "red")

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.9.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Convert the hclust fit to a dendrogram and colour its branches by the
# groups obtained when cutting at height 3.
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- avg_dend_obj %>% color_branches(h = 3)

# Draw the coloured dendrogram.
plot(avg_col_dend)

# Attach the hierarchical cluster assignment to the unscaled data.
domestict20batting_cl <- domestict20batting %>%
  mutate(cluster = cut_avg)

# Number of players in each cluster.
domestict20batting_cl %>%
  count(cluster)
## # A tibble: 6 x 2
##   cluster     n
##     <int> <int>
## 1       1     1
## 2       2     9
## 3       3    10
## 4       4    89
## 5       5   376
## 6       6     1
# Matches vs. runs, one colour per hierarchical cluster.
ggplot(
  data = domestict20batting_cl,
  mapping = aes(x = Matches, y = Runs, colour = factor(cluster))
) +
  geom_point()